###### THE SUBNATIONAL ELECTORAL COERCION IN INDIA DATA SET, 1985-2015 ######### 

# Authors: Richetta, C., Harbers, I., & van Wingerden, E. 
# DOI of the article: 
# Script last checked on June 21st, 2023

library(tidyverse)
library(magrittr)
library(readxl)
library(sf)

#---- Open data set ----
# Directory that holds all 4 SECI data sets. Leave it empty to read the files
# from the current working directory (calling setwd("") would raise an error).
data_dir <- ""
if (nzchar(data_dir)) {
  setwd(data_dir)
}
seci <- read_csv("seci-main-data-set.csv")

# Number of rows of the main data set per state and year. Every row counts as
# one; the count is kept as a double so that it can be combined with the
# numeric 0 used for state-years without any record when Figure 2 is built.
# The result stays grouped by state_name (the default after summarising over
# two grouping variables); this is now spelled out with `.groups`.
state <- seci %>%
  group_by(state_name, year) %>%
  summarize(coercion = as.numeric(n()), .groups = "drop_last")

#---- FIGURE 1. Number of assembly constituencies affected by electoral coercion and its sub-types ----
# Yearly totals: every row of the main data set counts once towards "Overall";
# the `fraud` and `boycott` columns are summed as they are. Years are placed on
# 1 January so that the x axis is a date axis.
fig1 <- seci %>%
  mutate(coercion = 1) %>%
  group_by(year) %>%
  summarize(
    coercion = sum(coercion),
    fraud = sum(fraud),
    boycott = sum(boycott)
  ) %>%
  mutate(year = as.Date(paste0(year, "-01-01"), format = "%Y-%m-%d")) %>%
  ggplot(aes(x = year)) +
  geom_line(aes(y = coercion, color = "Overall", linetype = "Overall"),
            linewidth = 0.2) +
  geom_line(aes(y = fraud, color = "Fraud", linetype = "Fraud"),
            linewidth = 0.2) +
  geom_line(aes(y = boycott, color = "Boycott", linetype = "Boycott"),
            linewidth = 0.2) +
  scale_y_continuous(
    name = "Number of assembly constituency",
    limits = c(0, 501)
  ) +
  theme_minimal() +
  theme(
    plot.margin = unit(x = c(-7, 0, 0, 0), units = "mm"),
    legend.position = "top",
    legend.box.margin = unit(x = c(0, 0, -5, 0), units = "mm"),
    legend.title = element_blank(),
    legend.margin = margin(t = 0, unit = "cm"),
    axis.text.x = element_text(size = 5),
    axis.title.x = element_text(size = 5),
    axis.text.y = element_text(size = 5),
    axis.title.y = element_text(size = 5),
    legend.text = element_text(size = 5)
  ) +
  ggtitle("") +
  scale_color_manual(name = "Coercion",
                     values = c("Overall" = "#000000",
                                "Fraud" = "#999999",
                                "Boycott" = "#CCCCCC")) +
  scale_linetype_manual(name = "Coercion",
                        values = c("Overall" = "dotted",
                                   "Fraud" = "solid",
                                   "Boycott" = "solid")) +
  # Hand-placed labels next to the peaks; positions are in data coordinates.
  annotate(
    geom = "text",
    x = as.Date(c("2012-01-01", "2012-01-01", "2002-01-01",
                  "1992-01-01", "1987-01-01")),
    y = c(476, 501, 165, 135, 375),
    label = c("Uttar Pradesh 2012", "Manipur 2012", "Punjab 2002",
              "Punjab 1992", "Bihar 1985"),
    color = "#333333", size = 1.8
  )
fig1
# Save this specific plot (instead of relying on last_plot()) and make sure the
# output directory exists before writing into it.
dir.create("figures", showWarnings = FALSE)
ggsave("figures/figure1.jpeg", plot = fig1, width = 8, height = 4,
       units = "cm", dpi = 1200, bg = "white")

#---- FIGURE 2. Percentage of assembly constituencies affected, by state and election number starting in 1985 ----
# One row per state election; must provide state_name, year and
# total_constituencies.
totalac <- read_csv("seci-election-information.csv")

fig2 <- state %>%
  # right join: keep every election, including those without any SECI record
  right_join(totalac, by = c("state_name", "year")) %>%
  arrange(state_name, year) %>%
  mutate(
    coercion = if_else(is.na(coercion), 0, coercion),
    percent = (coercion * 100) / total_constituencies
  ) %>%
  # Election number within each state, in chronological order. The grouping is
  # set here explicitly rather than inherited from `state`.
  group_by(state_name) %>%
  mutate(sequence = row_number()) %>%
  ungroup() %>%
  # A factor built from the integers keeps the axis in numeric order
  # (as character, "10" would be placed before "2").
  mutate(sequence = factor(sequence)) %>%
  select(state_name, sequence, percent) %>%
  ggplot(aes(x = sequence, y = state_name)) +
  geom_tile(aes(fill = percent), color = "white",
            linewidth = 0.4,
            linetype = 1) +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.margin = unit(x = c(0, 0, 0, 0), units = "mm"),
        legend.position = "bottom",
        legend.box.margin = unit(x = c(-1, 0, 0, -1), units = "mm"),
        legend.margin = margin(t = 0, unit = "cm"),
        legend.title = element_text(size = 5),
        axis.text.x = element_text(size = 5),
        axis.title.x = element_text(size = 5),
        axis.text.y = element_text(size = 5),
        axis.title.y = element_text(size = 5),
        legend.text = element_text(size = 5)
  ) +
  scale_x_discrete(
    name = "Election number (from 1985 on)") +
  scale_y_discrete(
    limits = rev, # reverse the default order of the states on the y axis
    name = "State") +
  geom_text(aes(label = round(percent, digits = 2)), color = "white", size = 1.5) +
  scale_fill_gradient(low = "#DDDDDD", high = "#000000") +
  guides(fill = guide_colourbar(barwidth = 5,
                                barheight = 0.3,
                                title = "Percent"))
fig2
# Save this specific plot and make sure the output directory exists.
dir.create("figures", showWarnings = FALSE)
ggsave("figures/figure2.jpeg", plot = fig2, width = 8, height = 8,
       units = "cm", dpi = 1200, bg = "white")


#---- FIGURE 3. Overall number of fatalities related to electoral violence, per year ----
# create data set with number of deaths by state and year in the district and state data sets
# Stack the fatalities recorded in the state-level and in the district-level
# data sets (state file first), then add them up per state and year.
seci_stdt <- do.call(
  rbind,
  lapply(
    c("seci-state-data-set.csv", "seci-district-data-set.csv"),
    function(csv_path) {
      select(read_csv(csv_path), state_name, year, death_toll)
    }
  )
) %>%
  group_by(state_name, year) %>%
  summarize(death_toll = sum(death_toll))

# little sub data set to have information in which state/year there are most deaths
# Fatalities per state and year, ranked from most to least deadly:
#   death_toll_ac   - fatalities in the main (assembly constituency) data set
#   death_toll.y    - fatalities in the state and district data sets
#   death_toll_full - sum of the two
# The full join keeps state-years that appear in only one of the sources, so
# BOTH sides can be missing after the join; each is set to 0 before summing
# (otherwise the total would be NA and sorted to the bottom of the ranking).
statedeath <- seci %>%
  group_by(year, state_name) %>%
  summarize(death_toll = sum(death_toll), .groups = "drop_last") %>%
  arrange(desc(death_toll)) %>%
  full_join(
    seci_stdt %>%
      group_by(year, state_name) %>%
      summarize(death_toll = sum(death_toll), .groups = "drop_last"),
    by = c("state_name", "year")
  ) %>%
  rename(death_toll_ac = death_toll.x) %>%
  mutate(
    death_toll_ac = coalesce(death_toll_ac, 0),
    death_toll.y = coalesce(death_toll.y, 0),
    death_toll_full = death_toll_ac + death_toll.y
  ) %>%
  arrange(desc(death_toll_full))
  

# let's go for the figure  
# Yearly fatalities: assembly-level only (main data set) versus all three data
# sets together (main + state + district).
fig3 <- seci %>%
  group_by(year) %>%
  summarize(death_toll = sum(death_toll)) %>%
  full_join(
    seci_stdt %>%
      group_by(year) %>%
      summarize(death_toll = sum(death_toll)),
    by = "year"
  ) %>%
  rename(death_toll_ac = death_toll.x) %>%
  mutate(
    # The full join can leave either side missing for a year; treat a missing
    # side as zero fatalities so that the total is never NA.
    death_toll_ac = coalesce(death_toll_ac, 0),
    death_toll.y = coalesce(death_toll.y, 0),
    death_toll_full = death_toll_ac + death_toll.y,
    # place each year on 1 January to obtain a date axis
    year = as.Date(paste0(year, "-01-01"), format = "%Y-%m-%d")
  ) %>%
  ggplot(aes(x = year)) +
  # `linewidth` (not the deprecated `size`) sets line thickness, as in the
  # other line figures of this script.
  geom_line(aes(y = death_toll_full, color = "All data sets fatalities"),
            linewidth = 0.2) +
  geom_line(aes(y = death_toll_ac, color = "Assembly-level fatalities"),
            linewidth = 0.2) +
  scale_y_continuous(
    name = "Total number of fatalities",
    limits = c(0, 125)
  ) +
  theme_minimal() +
  theme(
    plot.margin = unit(x = c(-5, 0, 0, 0), units = "mm"),
    legend.position = "top",
    legend.box.margin = unit(x = c(0, 0, -5, 0), units = "mm"),
    legend.title = element_blank(),
    legend.margin = margin(t = 0, unit = "cm"),
    axis.text.x = element_text(size = 5),
    axis.title.x = element_text(size = 5),
    axis.text.y = element_text(size = 5),
    axis.title.y = element_text(size = 5),
    legend.text = element_text(size = 5)
  ) +
  scale_color_manual(name = "Fatalities",
                     values = c("All data sets fatalities" = "#CCCCCC",
                                "Assembly-level fatalities" = "#666666")) +
  ggtitle("") +
  # Hand-placed labels next to the peaks; positions are in data coordinates.
  # Two-line labels ("Bihar" above "1990"/"1985") are drawn as separate texts.
  annotate(
    geom = "text",
    x = as.Date(paste0(
      c(1988, 1990, 1990, 1994, 1996, 1996, 2014, 2008, 2008, 1985, 1985),
      "-01-01"
    )),
    y = c(118, 102, 107, 107, 72, 77, 43, 46, 40, 98, 103),
    label = c("Tripura 1988", "1990", "Bihar", "Punjab 1992", "Assam 1996",
              "J&K 1996", "J&K 2014", "Maharashtra 2009", "Orissa 2009",
              "1985", "Bihar"),
    color = "#333333", size = 1.6
  )
fig3
# Save this specific plot and make sure the output directory exists.
dir.create("figures", showWarnings = FALSE)
ggsave("figures/figure3.jpeg", plot = fig3, width = 8, height = 5,
       units = "cm", dpi = 1200, bg = "white")

#---- CROSS VALIDATION WITH DECO AND FIGURE ----
# download DECO data set in xlsx format from: https://ucdp.uu.se/downloads/index.html#deco
  # here the DECO data file is called "DECO_v.1.0.xlsx"
# download GADM India shapefile from: https://gadm.org/download_country.html 
  # here the GADM India file is called "gadm41_IND_1.shp"
# put the files in your working directory

#---- Mean, sd, min, max comparisons ----
# open SECI data sets and transform SECI data sets for comparison

# Sum the fatalities of a SECI data set per state and collection window.
#
# dt: data frame with state_name, death_toll, polling_start, polling_end and
#     result_date; dates are day/month/year strings separated by "." or "/".
#
# The window runs from 30 days before polling starts (collect_start) to
# 30 days after the result date (collect_end); when the result date is missing
# the end of polling is used in its place. Only windows starting after
# 1988-12-31 are returned.
agg_state_date <- function(dt) {
  # accept both "dd.mm.yyyy" and "dd/mm/yyyy"
  parse_dmy <- function(x) {
    as.Date(str_replace_all(x, "\\.", "/"), format = "%d/%m/%Y")
  }

  dt %>%
    mutate(
      polling_start = parse_dmy(polling_start),
      collect_start = polling_start - 30,
      result_date = parse_dmy(
        ifelse(is.na(result_date), paste(polling_end), result_date)
      ),
      collect_end = result_date + 30
    ) %>%
    group_by(state_name, collect_start, collect_end) %>%
    summarise(death_toll = sum(death_toll)) %>%
    filter(collect_start > as.Date("1988-12-31"))
}

# Build one zero-fatality row per row of `dt`, with the same collection window
# as agg_state_date(): 30 days before polling starts to 30 days after the
# result date (or after the end of polling when the result date is missing).
#
# dt: data frame with state_name, polling_start, polling_end and result_date;
#     dates are day/month/year strings separated by "." or "/".
#
# Returns state_name, collect_start, collect_end and death_toll (always 0),
# restricted to windows starting after 1988-12-31. Rows are not aggregated.
agg2_state_date <- function(dt) {
  # accept both "dd.mm.yyyy" and "dd/mm/yyyy"
  parse_dmy <- function(x) {
    as.Date(str_replace_all(x, "\\.", "/"), format = "%d/%m/%Y")
  }

  dt %>%
    mutate(
      polling_start = parse_dmy(polling_start),
      collect_start = polling_start - 30,
      result_date = parse_dmy(
        ifelse(is.na(result_date), paste(polling_end), result_date)
      ),
      collect_end = result_date + 30
    ) %>%
    select(state_name, collect_start, collect_end) %>%
    mutate(death_toll = 0) %>%
    filter(collect_start > as.Date("1988-12-31"))
}

# SECI fatalities per state and collection window, over all three event data
# sets. The election-information rows carry death_toll = 0, so every election
# window after 1988 is present even when SECI records no fatality for it.
# bind_rows() stacks the grouped and ungrouped pieces by column name in a
# single flat call.
df_seci <- bind_rows(
  agg_state_date(read_csv("seci-main-data-set.csv")),
  agg_state_date(read_csv("seci-district-data-set.csv")),
  agg_state_date(read_csv("seci-state-data-set.csv")),
  agg2_state_date(read_csv("seci-election-information.csv"))
) %>%
  group_by(state_name, collect_start, collect_end) %>%
  summarise(death_seci = sum(death_toll), .groups = "drop_last")

# open DECO data set and transform for comparison 
# India shapefile
india_gadm <- st_read("gadm41_IND_1.shp")
#ggplot() +
#  geom_sf(data = india_gadm)
# st_crs(india_gadm) # epsg is 4326

# DECO data
deco <- readxl::read_xlsx("DECO_v.1.0.xlsx")
#deco_sf <- deco %>%
#  filter(country=="India" & latitude!="NA" & longitude!="NA") %>%
#  mutate(longitude = str_replace_all(longitude, "[a-z]|[A-Z]", "")) %>%
#  mutate(Y = as.numeric(latitude)) %>%
#  mutate(X = as.numeric(longitude)) %>%
#  select(id, best, high, low, electoral_vio_uncertainty, date_start, date_end, X, Y) %>%
#  st_as_sf(coords = c("X", "Y"), crs=4326)
#deco_sf
#ggplot() +
#  geom_sf(data = india_gadm) +     # Indian states
#  geom_sf(data = deco_sf, color = "red", size = 1)   # DECO events 

# DECO events in India with usable coordinates, each assigned to the GADM
# state polygon it falls into and renamed to the state names used in SECI.
# NOTE(review): the date comparisons below (and later in the script) assume
# date_start/date_end are Dates or ISO "yyyy-mm-dd" strings - confirm that
# read_xlsx() does not return them as date-times.
deco_join <- deco %>%
  filter(country == "India" & latitude != "NA" & longitude != "NA") %>%
  mutate(
    # strip stray letters from the longitude before converting it to a number
    longitude = str_replace_all(longitude, "[a-z]|[A-Z]", ""),
    Y = as.numeric(latitude),
    X = as.numeric(longitude)
  ) %>%
  select(id, best, electoral_vio_uncertainty, date_start, date_end, X, Y,
         side_a, side_b, electoral_purpose, source_article) %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326) %>%
  st_join(india_gadm["NAME_1"]) %>%
  mutate(
    NAME_1 = if_else(NAME_1 == "Jammu and Kashmir", "Jammu & Kashmir", NAME_1),
    NAME_1 = if_else(NAME_1 == "Odisha", "Orissa", NAME_1),
    # Events dated before the cut-off dates below are attributed to the state
    # named on the right instead of the present-day polygon they fall into.
    NAME_1 = if_else(date_start < as.Date("2000-11-15") & NAME_1 == "Jharkhand",
                     "Bihar", NAME_1),
    NAME_1 = if_else(date_start < as.Date("2000-11-01") & NAME_1 == "Chhattisgarh",
                     "Madhya Pradesh", NAME_1),
    # "Uttarakhand" is the expected GADM spelling; the previously tested
    # "Uttarkhand" is kept as an accepted alternative.
    NAME_1 = if_else(date_start < as.Date("2000-11-09") &
                       NAME_1 %in% c("Uttarakhand", "Uttarkhand"),
                     "Uttar Pradesh", NAME_1)
    # NOTE(review): if the shapefile has a separate Telangana polygon, earlier
    # events there may need to be attributed to Andhra Pradesh - TODO confirm.
  ) %>%
  rename(state_name = NAME_1)

# join SECI and DECO together  
# Compare SECI and DECO fatalities per state and collection window.
#
# dt: DECO events with state_name, date_start, date_end, best and
#     electoral_vio_uncertainty (e.g. deco_join).
# n:  minimum number of fatalities; a window is kept when SECI or DECO
#     reports at least n.
#
# Uses the global `df_seci` as the list of windows. A DECO event is counted in
# a window of its state when it starts after collect_start, ends before
# collect_end and has electoral_vio_uncertainty == 0. Windows without any such
# event get death_deco = 0.
#
# Returns one row per window with death_seci, death_deco and
# seci_report = death_seci - death_deco (positive: SECI reports more), grouped
# by state_name and collect_start.
similarity_datasets <- function(dt, n) {
  df_seci %>%
    inner_join(dt, by = "state_name") %>%
    filter(collect_end > date_end & collect_start < date_start &
             electoral_vio_uncertainty == 0) %>%
    group_by(state_name, collect_start, collect_end) %>%
    summarize(best = sum(best, na.rm = TRUE), .groups = "drop_last") %>%
    # bring back the windows that have no matching DECO event
    right_join(df_seci, by = c("state_name", "collect_start", "collect_end")) %>%
    mutate(best = if_else(is.na(best), 0, best)) %>%
    rename(death_deco = best) %>%
    filter(death_deco >= n | death_seci >= n) %>%
    mutate(seci_report = death_seci - death_deco)
}

# All windows (n = 0), ordered from the largest DECO surplus to the largest
# SECI surplus.
seci_deco_all <- similarity_datasets(deco_join, 0) %>%
  arrange(seci_report)

#---- FIGURE 4. Overall number of fatalities related to electoral violence in DECO and SECI, per year ----
fig4 <- seci_deco_all %>%
  ungroup() %>%
  # Each window is assigned to the calendar year of collect_start (30 days
  # before polling starts), placed on 1 January to obtain a date axis.
  mutate(year = as.Date(format(collect_start, "%Y-01-01"),
                        format = "%Y-%m-%d")) %>%
  group_by(year) %>%
  summarize(death_toll_deco = sum(death_deco),
            death_toll_seci = sum(death_seci)) %>%
  ggplot(aes(x = year)) +
  geom_line(aes(y = death_toll_deco, color = "DECO fatalities"), linewidth = 0.2) +
  geom_line(aes(y = death_toll_seci, color = "SECI fatalities"), linewidth = 0.2) +
  scale_y_continuous(
    name = "Total number of fatalities",
    limits = c(0, 120)
  ) +
  theme_minimal() +
  theme(
    plot.margin = unit(x = c(-5, 0, 0, 0), units = "mm"),
    legend.position = "top",
    legend.box.margin = unit(x = c(0, 0, -5, 0), units = "mm"),
    legend.title = element_blank(),
    legend.margin = margin(t = 0, unit = "cm"),
    axis.text.x = element_text(size = 5),
    axis.title.x = element_text(size = 5),
    axis.text.y = element_text(size = 5),
    axis.title.y = element_text(size = 5),
    legend.text = element_text(size = 5)
  ) +
  scale_color_manual(name = "Fatalities",
                     values = c("DECO fatalities" = "#CCCCCC",
                                "SECI fatalities" = "#666666")) +
  ggtitle("")
fig4
# Save this specific plot and make sure the output directory exists.
dir.create("figures", showWarnings = FALSE)
ggsave("figures/figure4.jpeg", plot = fig4, width = 8, height = 5,
       units = "cm", dpi = 1200, bg = "white")

#---- Qualitative evidence: DECO events ----
# Windows in which DECO reports more than 9 fatalities above SECI.
# seci_deco_all is a grouped tibble, so it is ungrouped first: otherwise the
# row limit would apply within each group instead of to the whole table.
seci_deco_all %>%
  ungroup() %>%
  filter(seci_report < (-9)) %>%
  slice_head(n = 10)
# Chhattisgarh    2008-10-15    2009-01-07
# Jammu & Kashmir 2002-08-17    2002-11-09
# Jammu & Kashmir 2008-10-18    2009-01-27
# Uttar Pradesh   1991-04-20    1991-07-16

# DECO events that fall strictly inside one of the four windows listed above
deco_quali <- deco_join %>%
  filter(
    (state_name == "Chhattisgarh" &
       date_start > as.Date("2008-10-15") & date_end < as.Date("2009-01-07")) |
      (state_name == "Jammu & Kashmir" &
         date_start > as.Date("2002-08-17") & date_end < as.Date("2002-11-09")) |
      (state_name == "Jammu & Kashmir" &
         date_start > as.Date("2008-10-18") & date_end < as.Date("2009-01-27")) |
      (state_name == "Uttar Pradesh" &
         date_start > as.Date("1991-04-20") & date_end < as.Date("1991-07-16"))
  )
table(deco_quali$electoral_purpose) # 93 part of "spells" of violence
# 37 part of disrupt
table(deco_quali$side_a) # Government of India 113, Kashmir Insurgents 31
table(deco_quali$side_b) # Kashmir insurgents 105 

#---- Qualitative evidence: SECI events ----
# Windows in which SECI reports at least 10 fatalities more than DECO, from
# the smallest to the largest difference. The tibble is ungrouped first so
# that the 13-row limit applies to the whole table, not to each group.
seci_deco_all %>%
  ungroup() %>%
  filter(seci_report >= 10) %>%
  arrange(seci_report) %>%
  slice_head(n = 13)

# Keep the individual deadly events of a SECI data set together with their
# collection window and free-text descriptions (no aggregation).
#
# dt: data frame with state_name, death_toll, polling_start, polling_end,
#     result_date, event_description.1 and event_description.2; dates are
#     day/month/year strings separated by "." or "/".
#
# The window runs from 30 days before polling starts (collect_start) to
# 30 days after the result date (collect_end); when the result date is missing
# the end of polling is used in its place. Only rows with death_toll > 0 and a
# window starting after 1988-12-31 are returned.
agg_state_event <- function(dt) {
  # accept both "dd.mm.yyyy" and "dd/mm/yyyy"
  parse_dmy <- function(x) {
    as.Date(str_replace_all(x, "\\.", "/"), format = "%d/%m/%Y")
  }

  dt %>%
    mutate(
      polling_start = parse_dmy(polling_start),
      collect_start = polling_start - 30,
      result_date = parse_dmy(
        ifelse(is.na(result_date), paste(polling_end), result_date)
      ),
      collect_end = result_date + 30
    ) %>%
    select(state_name, death_toll, collect_start, collect_end,
           event_description.1, event_description.2) %>%
    filter(collect_start > as.Date("1988-12-31") & death_toll > 0)
}


# Deadly SECI events (main, district, then state data set) that belong to a
# window in which SECI reports at least 10 fatalities more than DECO. Matching
# is on state and window start; both tables carry collect_end, so the join
# suffixes that column.
df_seci_event <- Reduce(
  rbind,
  lapply(
    c("seci-main-data-set.csv",
      "seci-district-data-set.csv",
      "seci-state-data-set.csv"),
    function(csv_path) agg_state_event(read_csv(csv_path))
  )
) %>%
  inner_join(
    filter(seci_deco_all, seci_report >= 10),
    by = c("state_name", "collect_start")
  )
# explore the df_seci_event for the description of events that are not in DECO


################################# end of script ################################